maftools : Summarize, Analyze and Visualize MAF Files (Mayakonda et al. 2018)

6. Reading and summarizing maf files

read MAF files and summerizes in different ways as well as making MAF an object.

library(maftools)
clinical_data = read.table("tcga_laml_annot.tsv", sep="\t", header = TRUE) #read in metadata
laml_maf = read.maf(maf = "tcga_laml.maf", clinicalData = clinical_data) #create a MAF object
## -Reading
## -Validating
## -Silent variants: 475 
## -Summarizing
## -Processing clinical data
## -Finished in 0.195s elapsed (0.346s cpu)
laml_df = subsetMaf(laml_maf, mafObj=FALSE) #creates a data.frame of laml.maf - this can be viewed as a table 
APL_prim = read.table("APL_primary.maf.gz", sep="\t", header = TRUE) #read 
APL_relap = read.table("APL_relapse.maf.gz", sep="\t", header = TRUE) #read 
laml_annot= read.table("tcga_laml_annot.tsv", sep="\t", header = TRUE) #read 
LAML_sig = read.table("LAML_sig_genes.txt.gz", sep="\t", header = TRUE) #read 
laml_maf #Summary of laml_maf file
## An object of class  MAF 
##                    ID          summary  Mean Median
##  1:        NCBI_Build               37    NA     NA
##  2:            Center genome.wustl.edu    NA     NA
##  3:           Samples              193    NA     NA
##  4:            nGenes             1241    NA     NA
##  5:   Frame_Shift_Del               52 0.271      0
##  6:   Frame_Shift_Ins               91 0.474      0
##  7:      In_Frame_Del               10 0.052      0
##  8:      In_Frame_Ins               42 0.219      0
##  9: Missense_Mutation             1342 6.990      7
## 10: Nonsense_Mutation              103 0.536      0
## 11:       Splice_Site               92 0.479      0
## 12:             total             1732 9.021      9
getSampleSummary(laml_maf) #Sample summary
##      Tumor_Sample_Barcode Frame_Shift_Del Frame_Shift_Ins In_Frame_Del
##   1:         TCGA-AB-3009               0               5            0
##   2:         TCGA-AB-2807               1               0            1
##   3:         TCGA-AB-2959               0               0            0
##   4:         TCGA-AB-3002               0               0            0
##   5:         TCGA-AB-2849               0               1            0
##  ---                                                                  
## 188:         TCGA-AB-2933               0               0            0
## 189:         TCGA-AB-2942               0               0            0
## 190:         TCGA-AB-2946               0               0            0
## 191:         TCGA-AB-2954               0               0            0
## 192:         TCGA-AB-2982               0               0            0
##      In_Frame_Ins Missense_Mutation Nonsense_Mutation Splice_Site total
##   1:            1                25                 2           1    34
##   2:            0                16                 3           4    25
##   3:            0                22                 0           1    23
##   4:            0                15                 1           5    21
##   5:            0                16                 1           2    20
##  ---                                                                   
## 188:            0                 1                 0           0     1
## 189:            1                 0                 0           0     1
## 190:            0                 1                 0           0     1
## 191:            0                 1                 0           0     1
## 192:            0                 1                 0           0     1
getGeneSummary(laml_maf) #Gene summary
##       Hugo_Symbol Frame_Shift_Del Frame_Shift_Ins In_Frame_Del In_Frame_Ins
##    1:        FLT3               0               0            1           33
##    2:      DNMT3A               4               0            0            0
##    3:        NPM1               0              33            0            0
##    4:        IDH2               0               0            0            0
##    5:        IDH1               0               0            0            0
##   ---                                                                      
## 1237:      ZNF689               0               0            0            0
## 1238:      ZNF75D               0               0            0            0
## 1239:      ZNF827               1               0            0            0
## 1240:       ZNF99               0               0            0            0
## 1241:        ZPBP               0               0            0            0
##       Missense_Mutation Nonsense_Mutation Splice_Site total MutatedSamples
##    1:                15                 0           3    52             52
##    2:                39                 5           6    54             48
##    3:                 1                 0           0    34             33
##    4:                20                 0           0    20             20
##    5:                18                 0           0    18             18
##   ---                                                                     
## 1237:                 1                 0           0     1              1
## 1238:                 1                 0           0     1              1
## 1239:                 0                 0           0     1              1
## 1240:                 1                 0           0     1              1
## 1241:                 1                 0           0     1              1
##       AlteredSamples
##    1:             52
##    2:             48
##    3:             33
##    4:             20
##    5:             18
##   ---               
## 1237:              1
## 1238:              1
## 1239:              1
## 1240:              1
## 1241:              1
getFields(laml_maf) #Shows all fiedls in maf file
##  [1] "Hugo_Symbol"            "Entrez_Gene_Id"         "Center"                
##  [4] "NCBI_Build"             "Chromosome"             "Start_Position"        
##  [7] "End_Position"           "Strand"                 "Variant_Classification"
## [10] "Variant_Type"           "Reference_Allele"       "Tumor_Seq_Allele1"     
## [13] "Tumor_Seq_Allele2"      "Tumor_Sample_Barcode"   "Protein_Change"        
## [16] "i_TumorVAF_WU"          "i_transcript_name"
write.mafSummary(maf= laml_maf, basename = "laml_maf") #makes an output file of the laml_maf basename

7. Visualization

Plots the summary of the MAF file in several ways such as boxplot by variant class, barplot of eah sample. Oncoplots shows all samples and what genes are altered. Oncostrip shows the percent of mutatations in each gene in all the samples. Transitions and transversions can be looked at boxplots and percent mutations. Lollipop plots show mutation areas of the proteins at particular hotspots. Rainfall plots show hyper mutated regions using inter variant distance. Now compare mutations to 30 different TCGA cancer cohorts. Plotting the variant allele frequencies shows the top mutated genes that have 50% allele frequency. A genecloud plot show the top mutated gene.

plotmafSummary(maf = laml_maf, rmOutlier = TRUE, addStat = 'median', dashboard = TRUE, titvRaw = FALSE) #plots the laml_maf summary file 

oncoplot(maf = laml_maf, top = 10) #plots top n=10 mutated genes

oncostrip(maf= laml_maf, genes = c('DNMT3A', 'NPM1', 'RUNX1')) #plots specific genes

laml.titv =  titv(maf =laml_maf, plot = FALSE, useSyn = TRUE) #seperates SNPS into transitions and transversions then produces a table with the summary
plotTiTv(res = laml.titv) #plots titv file 

lollipopPlot(maf = laml_maf, gene = 'DNMT3A', AACol = 'Protein_Change', showMutationRate = TRUE) #plots mutation spots on protien strucures using the AACol input 
## 3 transcripts available. Use arguments refSeqID or proteinID to manually specify tx name.
##      HGNC refseq.ID protein.ID aa.length
## 1: DNMT3A NM_175629  NP_783328       912
## 2: DNMT3A NM_022552  NP_072046       912
## 3: DNMT3A NM_153759  NP_715640       723
## Using longer transcript NM_175629 for now.
## Removed 3 mutations for which AA position was not available

lollipopPlot(maf=laml_maf, gene= 'KIT', AACol = 'Protein_Change', labelPos=816, refSeqID = 'NM_000222') #look at a label positon 816

brca <- system.file('extdata', 'brca.maf.gz', package = "maftools")
brca = read.maf(maf = brca, verbose = FALSE)
rainfallPlot(maf=brca, detectChangePoints = TRUE, pointSize = 0.6) #plots hypermutated genomic regions and higlights areas that may change inter-event distance
## Processing TCGA-A8-A08B..
## Kataegis detected at:
##    Chromosome Start_Position End_Position nMuts Avg_intermutation_dist Size
## 1:          8       98129391     98133560     6               833.8000 4169
## 2:          8       98398603     98403536     8               704.7143 4933
## 3:          8       98453111     98456466     8               479.2857 3355
## 4:          8      124090506    124096810    21               315.2000 6304
## 5:         12       97437934     97439705     6               354.2000 1771
## 6:         17       29332130     29336153     7               670.5000 4023
##    Tumor_Sample_Barcode C>G C>T
## 1:         TCGA-A8-A08B   4   2
## 2:         TCGA-A8-A08B   1   7
## 3:         TCGA-A8-A08B  NA   8
## 4:         TCGA-A8-A08B   1  20
## 5:         TCGA-A8-A08B   3   3
## 6:         TCGA-A8-A08B   4   3

laml_mutload= tcgaCompare(maf = laml_maf, cohortName= 'Example-LAML') #Compares mutation load to input MAF file 

# 7. continued

plotVaf(maf = laml_maf, vafCol = 'i_TumorVAF_WU') #Plots variant allele frequency to calculate the clonal status of the top mutated genes 

geneCloud(input = laml_maf, minMut = 3) #Makes a word cloud withthe mutated genes with a minimum number of 3 samples the gene is mutated 

8. Processing copy-number data

Now read and summarize the GISTIC files using a genome plot, bubble plot, and an oncoplot. Then look at CBS segments

#summarize output from the GISTIC programe 
all.lesions <- system.file("extdata", "all_lesions.conf_99.txt", package = "maftools")
amp.genes <- system.file("extdata", "amp_genes.conf_99.txt", package = "maftools")
del.genes <- system.file("extdata", "del_genes.conf_99.txt", package = "maftools")
scores.gis <- system.file("extdata", "scores.gistic", package = "maftools")
laml.gistic = readGistic(gisticAllLesionsFile = all.lesions, gisticAmpGenesFile = amp.genes, gisticDelGenesFile = del.genes, gisticScoresFile = scores.gis, isTCGA = TRUE)
## -Processing Gistic files..
## --Processing amp_genes.conf_99.txt
## --Processing del_genes.conf_99.txt
## --Processing scores.gistic
## --Summarizing by samples
laml.gistic #GISTIC object 
## An object of class  GISTIC 
##           ID summary
## 1:   Samples     191
## 2:    nGenes    2622
## 3: cytoBands      16
## 4:       Amp     388
## 5:       Del   26481
## 6:     total   26869
gisticChromPlot(gistic = laml.gistic, markBands = "all") #plots GISTIC data 

gisticBubblePlot(gistic = laml.gistic) # plots as GISTIC data as a bubble plot

gisticOncoPlot(gistic = laml.gistic, clinicalData = getClinicalData(x = laml_maf), clinicalFeatures = 'FAB_classification', sortByAnnotation = TRUE, top = 10) # plots GISTIC as onoplots 

tcga.ab.009.seg <- system.file("extdata", "TCGA.AB.3009.hg19.seg.txt", package = "maftools")
plotCBSsegments(cbsFile = tcga.ab.009.seg) #plots CBS segments 
## NULL

# 9. Analysis (Tamborero, Gonzalez-Perez, and Lopez-Bigas 2013)

Now we can perform analysis. First look at sommatic interactions of the top 25 genes. Now detect cancer driver genes by looking at postional clusters. Now add pfam domains to the amino acid changes. Now, compare to the 21 cancer cohorts with ~200 genes with mutated significant results. Then predict the survivial genes by removing those with poor survivial. After, compare the two maf cohorts, primary and relpase to see if and what genes were altered. Then, make forest plots of PML and RAR genes. These genes had many mutations in the replapse cohort. Anohter way to show this data is by plotting co-onco plots side by side of the two cohorts of the top genes. For more detail plot a lollipop plot-2. Now, perform a clinical enrichment analysis. This looks at clinical features that are associated with samples. This shows enriched mutations in muliple categories. Then, look at the drug gene interactions to see the druggability based on the drug gene interaction database. Now, look at the oncogenic pathways in the TCGA cohorts. Display the complete patways for a better understanding. Finally look at mutations signatures/pattern of neuclotide subsitutions. These signatures are then compared to validated signatures. Do do this first extract the adjacent bases of the mutated base to make a mutation matrix. Then estimate the APOBEC enrichement scores. Now, look at the differences between APOBEC and non-enriched. Finally, performe signature analysis. To do this, estimate the signatures, find the optimal number of signatures, extract them, compare to the known signatures, and plot them.

library(devtools)
## Loading required package: usethis
library(maftools)
somaticInteractions(maf = laml_maf, top = 25, pvalue = c(0.05, 0.1)) #performs somatic interaction analysis to see if mutation is mutally exclusive or co-occuring using the pair-wise Fisher's Exact test 

##      gene1  gene2       pValue oddsRatio  00 11 01 10              Event
##   1: ASXL1  RUNX1 0.0001541586 55.215541 176  4 12  1       Co_Occurence
##   2:  IDH2  RUNX1 0.0002809928  9.590877 164  7  9 13       Co_Occurence
##   3:  IDH2  ASXL1 0.0004030636 41.077327 172  4  1 16       Co_Occurence
##   4:  FLT3   NPM1 0.0009929836  3.763161 125 17 16 35       Co_Occurence
##   5:  SMC3 DNMT3A 0.0010451985 20.177713 144  6 42  1       Co_Occurence
##  ---                                                                    
## 296: PLCE1  ASXL1 1.0000000000  0.000000 184  0  5  4 Mutually_Exclusive
## 297: RAD21  FAM5C 1.0000000000  0.000000 183  0  5  5 Mutually_Exclusive
## 298: PLCE1  FAM5C 1.0000000000  0.000000 184  0  5  4 Mutually_Exclusive
## 299: PLCE1  RAD21 1.0000000000  0.000000 184  0  5  4 Mutually_Exclusive
## 300:  EZH2  PLCE1 1.0000000000  0.000000 186  0  4  3 Mutually_Exclusive
##              pair event_ratio
##   1: ASXL1, RUNX1        4/13
##   2:  IDH2, RUNX1        7/22
##   3:  ASXL1, IDH2        4/17
##   4:   FLT3, NPM1       17/51
##   5: DNMT3A, SMC3        6/43
##  ---                         
## 296: ASXL1, PLCE1         0/9
## 297: FAM5C, RAD21        0/10
## 298: FAM5C, PLCE1         0/9
## 299: PLCE1, RAD21         0/9
## 300:  EZH2, PLCE1         0/7
laml.sig = oncodrive(maf = laml_maf, AACol = 'Protein_Change', minMut = 5, pvalMethod = 'zscore')
## Estimating background scores from synonymous variants..
## Not enough genes to build background. Using predefined values. (Mean = 0.279; SD = 0.13)
## Estimating cluster scores from non-syn variants..
## 
  |                                                                            
  |                                                                      |   0%
  |                                                                            
  |===                                                                   |   4%
  |                                                                            
  |======                                                                |   9%
  |                                                                            
  |=========                                                             |  13%
  |                                                                            
  |============                                                          |  17%
  |                                                                            
  |===============                                                       |  22%
  |                                                                            
  |==================                                                    |  26%
  |                                                                            
  |=====================                                                 |  30%
  |                                                                            
  |========================                                              |  35%
  |                                                                            
  |===========================                                           |  39%
  |                                                                            
  |==============================                                        |  43%
  |                                                                            
  |=================================                                     |  48%
  |                                                                            
  |=====================================                                 |  52%
  |                                                                            
  |========================================                              |  57%
  |                                                                            
  |===========================================                           |  61%
  |                                                                            
  |==============================================                        |  65%
  |                                                                            
  |=================================================                     |  70%
  |                                                                            
  |====================================================                  |  74%
  |                                                                            
  |=======================================================               |  78%
  |                                                                            
  |==========================================================            |  83%
  |                                                                            
  |=============================================================         |  87%
  |                                                                            
  |================================================================      |  91%
  |                                                                            
  |===================================================================   |  96%
  |                                                                            
  |======================================================================| 100%
## Comapring with background model and estimating p-values..
## Done !
head(laml.sig) # Finds driver cancer genes from MAF file, usually in certain "hot spots" 
##    Hugo_Symbol Frame_Shift_Del Frame_Shift_Ins In_Frame_Del In_Frame_Ins
## 1:        IDH1               0               0            0            0
## 2:        IDH2               0               0            0            0
## 3:        NPM1               0              33            0            0
## 4:        NRAS               0               0            0            0
## 5:       U2AF1               0               0            0            0
## 6:         KIT               1               1            0            1
##    Missense_Mutation Nonsense_Mutation Splice_Site total MutatedSamples
## 1:                18                 0           0    18             18
## 2:                20                 0           0    20             20
## 3:                 1                 0           0    34             33
## 4:                15                 0           0    15             15
## 5:                 8                 0           0     8              8
## 6:                 7                 0           0    10              8
##    AlteredSamples clusters muts_in_clusters clusterScores protLen   zscore
## 1:             18        1               18     1.0000000     414 5.546154
## 2:             20        2               20     1.0000000     452 5.546154
## 3:             33        2               32     0.9411765     294 5.093665
## 4:             15        2               15     0.9218951     189 4.945347
## 5:              8        1                7     0.8750000     240 4.584615
## 6:              8        2                9     0.8500000     976 4.392308
##            pval          fdr fract_muts_in_clusters
## 1: 1.460110e-08 1.022077e-07              1.0000000
## 2: 1.460110e-08 1.022077e-07              1.0000000
## 3: 1.756034e-07 8.194826e-07              0.9411765
## 4: 3.800413e-07 1.330144e-06              1.0000000
## 5: 2.274114e-06 6.367520e-06              0.8750000
## 6: 5.607691e-06 1.308461e-05              0.9000000
plotOncodrive(res = laml.sig, fdrCutOff = 0.1, useFraction = TRUE) #plot Oncodrive clusters depending on the cluster size

laml.pfam = pfamDomains(maf = laml_maf, AACol = 'Protein_Change', top = 10) #pfamDomains adds domain informaiton to amino acid changes and summarizes acording to affected doamins
## Warning in pfamDomains(maf = laml_maf, AACol = "Protein_Change", top = 10):
## Removed 50 mutations for which AA position was not available

laml.pfam$proteinSummary[,1:7, with = FALSE] #shows the domain labe of with frequent mutations 
##         HGNC AAPos Variant_Classification  N total  fraction   DomainLabel
##    1: DNMT3A   882      Missense_Mutation 27    54 0.5000000 AdoMet_MTases
##    2:   IDH1   132      Missense_Mutation 18    18 1.0000000      PTZ00435
##    3:   IDH2   140      Missense_Mutation 17    20 0.8500000      PTZ00435
##    4:   FLT3   835      Missense_Mutation 14    52 0.2692308      PKc_like
##    5:   FLT3   599           In_Frame_Ins 10    52 0.1923077      PKc_like
##   ---                                                                     
## 1512: ZNF646   875      Missense_Mutation  1     1 1.0000000          <NA>
## 1513: ZNF687   554      Missense_Mutation  1     2 0.5000000          <NA>
## 1514: ZNF687   363      Missense_Mutation  1     2 0.5000000          <NA>
## 1515: ZNF75D     5      Missense_Mutation  1     1 1.0000000          <NA>
## 1516: ZNF827   427        Frame_Shift_Del  1     1 1.0000000          <NA>
laml.mutsig <- system.file("extdata", "LAML_sig_genes.txt.gz", package = "maftools")  
pancanComparison(mutsigResults = laml.mutsig, qval = 0.1, cohortName = 'LAML', inputSampleSize = 200, label = 1) #compare mutSig results with pan=can list
## Significantly mutated genes in LAML (q < 0.1): 23
## Significantly mutated genes in PanCan cohort (q <0.1): 114
## Significantly mutated genes exclusive to LAML (q < 0.1):
##       gene pancan            q nMut log_q_pancan     log_q
##  1:  CEBPA  1.000 3.500301e-12   13   0.00000000 11.455895
##  2:   EZH2  1.000 7.463546e-05    3   0.00000000  4.127055
##  3: GIGYF2  1.000 6.378338e-03    2   0.00000000  2.195292
##  4:    KIT  0.509 1.137517e-05    8   0.29328222  4.944042
##  5:   PHF6  0.783 6.457555e-09    6   0.10623824  8.189932
##  6: PTPN11  0.286 7.664584e-03    9   0.54363397  2.115511
##  7:  RAD21  0.929 1.137517e-05    5   0.03198429  4.944042
##  8:  SMC1A  0.801 2.961696e-03    6   0.09636748  2.528460
##  9:   TET2  0.907 2.281625e-13   17   0.04239271 12.641756
## 10:    WT1  1.000 2.281625e-13   12   0.00000000 12.641756

##          gene   pancan            q nMut log_q_pancan    log_q
##   1:   ACVR1B 6.11e-02 1.000000e+00    0     1.213959  0.00000
##   2:     AKT1 2.68e-10 1.000000e+00    0     9.571865  0.00000
##   3:      APC 1.36e-13 1.000000e+00    0    12.866461  0.00000
##   4:    APOL2 7.96e-03 1.000000e+00    0     2.099087  0.00000
##   5: ARHGAP35 2.32e-12 1.000000e+00    1    11.634512  0.00000
##  ---                                                          
## 120:    U2AF1 4.07e-08 4.503311e-13    8     7.390406 12.34647
## 121:      VHL 2.32e-12 1.000000e+00    0    11.634512  0.00000
## 122:      WT1 1.00e+00 2.281625e-13   12     0.000000 12.64176
## 123:   ZNF180 8.60e-02 1.000000e+00    0     1.065502  0.00000
## 124:   ZNF483 2.37e-02 1.000000e+00    0     1.625252  0.00000
mafSurvival(maf = laml_maf, genes = 'DNMT3A', time = 'days_to_last_followup', Status = 'Overall_Survival_Status', isTCGA = TRUE) #surviival analysis and draws curve based on sample grouping mutaiton status 
## Looking for clinical data in annoatation slot of MAF..
## Number of mutated samples for given genes:
## DNMT3A 
##     48
## Removed 11 samples with NA's
## Median survival..
##     Group medianTime   N
## 1: Mutant        245  45
## 2:     WT        396 137

prog_geneset = survGroup(maf = laml_maf, top = 20, geneSetSize = 2, time = "days_to_last_followup", Status = "Overall_Survival_Status", verbose = FALSE) #Finds genes that have poor survival (top 20 of size 2)
## Removed 11 samples with NA's
print(prog_geneset) #prints poor survivial genes (11)
##     Gene_combination P_value    hr  WT Mutant
##  1:      FLT3_DNMT3A 0.00104 2.510 164     18
##  2:      DNMT3A_SMC3 0.04880 2.220 176      6
##  3:      DNMT3A_NPM1 0.07190 1.720 166     16
##  4:      DNMT3A_TET2 0.19600 1.780 176      6
##  5:        FLT3_TET2 0.20700 1.860 177      5
##  6:        NPM1_IDH1 0.21900 0.495 176      6
##  7:      DNMT3A_IDH1 0.29300 1.500 173      9
##  8:       IDH2_RUNX1 0.31800 1.580 176      6
##  9:        FLT3_NPM1 0.53600 1.210 165     17
## 10:      DNMT3A_IDH2 0.68000 0.747 178      4
## 11:      DNMT3A_NRAS 0.99200 0.986 178      4
mafSurvGroup(maf = laml_maf, geneSet = c("DNMT3A", "FLT3"), time = "days_to_last_followup", Status = "Overall_Survival_Status") #surviival analysis and draws curve based on sample grouping mutaiton status without 11 genes
## Looking for clinical data in annoatation slot of MAF..
## Removed 11 samples with NA's
## Median survival..
##     Group medianTime   N
## 1: Mutant      242.5  18
## 2:     WT      379.5 164

primary.apl = system.file("extdata", "APL_primary.maf.gz", package = "maftools") 
primary.apl = read.maf(maf = primary.apl)
## -Reading
## -Validating
## --Non MAF specific values in Variant_Classification column:
##   ITD
## -Silent variants: 45 
## -Summarizing
## -Processing clinical data
## --Missing clinical data
## -Finished in 0.044s elapsed (0.079s cpu)
relapse.apl = system.file("extdata", "APL_relapse.maf.gz", package = "maftools")
relapse.apl = read.maf(maf = relapse.apl)
## -Reading
## -Validating
## --Non MAF specific values in Variant_Classification column:
##   ITD
## -Silent variants: 19 
## -Summarizing
## -Processing clinical data
## --Missing clinical data
## -Finished in 0.038s elapsed (0.069s cpu)
pt.vs.rt <- mafCompare(m1 = primary.apl, m2 = relapse.apl, m1Name = 'Primary', m2Name = 'Relapse', minMut = 5)
print(pt.vs.rt) #Compares primary vs relpased to see if there is any differentially mutated genes by performing fisher test on the the differentially mutated genes from both groups 
## $results
##    Hugo_Symbol Primary Relapse         pval         or       ci.up      ci.low
## 1:         PML       1      11 1.529935e-05 0.03537381   0.2552937 0.000806034
## 2:        RARA       0       7 2.574810e-04 0.00000000   0.3006159 0.000000000
## 3:       RUNX1       1       5 1.310500e-02 0.08740567   0.8076265 0.001813280
## 4:        FLT3      26       4 1.812779e-02 3.56086275  14.7701728 1.149009169
## 5:      ARID1B       5       8 2.758396e-02 0.26480490   0.9698686 0.064804160
## 6:         WT1      20      14 2.229087e-01 0.60619329   1.4223101 0.263440988
## 7:        KRAS       6       1 4.334067e-01 2.88486293 135.5393108 0.337679367
## 8:        NRAS      15       4 4.353567e-01 1.85209500   8.0373994 0.553883512
## 9:      ARID1A       7       4 7.457274e-01 0.80869223   3.9297309 0.195710173
##         adjPval
## 1: 0.0001376942
## 2: 0.0011586643
## 3: 0.0393149868
## 4: 0.0407875250
## 5: 0.0496511201
## 6: 0.3343630535
## 7: 0.4897762916
## 8: 0.4897762916
## 9: 0.7457273717
## 
## $SampleSummary
##     Cohort SampleSize
## 1: Primary        124
## 2: Relapse         58
#par(mar=c(1,1,1,1))
forestPlot(mafCompareRes = pt.vs.rt, pVal = 0.1, color = c('royalblue', 'maroon'), geneFontSize = 0.8)

genes = c("PML", "RARA", "RUNX1", "ARID1B", "FLT3")
coOncoplot(m1 = primary.apl, m2 = relapse.apl, m1Name = 'PrimaryAPL', m2Name = 'RelapseAPL', genes = genes, removeNonMutated = TRUE) #plot the two coOncoplots side by side with the same genes 

lollipopPlot2(m1 = primary.apl, m2 = relapse.apl, gene = "PML", AACol1 = "amino_acid_change", AACol2 = "amino_acid_change", m1_name = "Primary", m2_name = "Relapse") #show the differences in a particualr gene using a lolipopPlot2
## 9 transcripts available. Use arguments refSeqID or proteinID to manually specify tx name.
##    HGNC refseq.ID protein.ID aa.length
## 1:  PML NM_033238  NP_150241       882
## 2:  PML NM_002675  NP_002666       633
## 3:  PML NM_033249  NP_150252       585
## 4:  PML NM_033247  NP_150250       435
## 5:  PML NM_033239  NP_150242       829
## 6:  PML NM_033250  NP_150253       781
## 7:  PML NM_033240  NP_150243       611
## 8:  PML NM_033244  NP_150247       560
## 9:  PML NM_033246  NP_150249       423
## Using longer transcript NM_033238 for now.
## 9 transcripts available. Use arguments refSeqID or proteinID to manually specify tx name.
##    HGNC refseq.ID protein.ID aa.length
## 1:  PML NM_033238  NP_150241       882
## 2:  PML NM_002675  NP_002666       633
## 3:  PML NM_033249  NP_150252       585
## 4:  PML NM_033247  NP_150250       435
## 5:  PML NM_033239  NP_150242       829
## 6:  PML NM_033250  NP_150253       781
## 7:  PML NM_033240  NP_150243       611
## 8:  PML NM_033244  NP_150247       560
## 9:  PML NM_033246  NP_150249       423
## Using longer transcript NM_033238 for now.

fab.ce = clinicalEnrichment(maf = laml_maf, clinicalFeature = 'FAB_classification') #enrichment analysis of a clinical feature in this case 'FAB_classification"
## Sample size per factor in FAB_classification:
## 
## M0 M1 M2 M3 M4 M5 M6 M7 
## 19 44 44 21 39 19  3  3
fab.ce$groupwise_comparision[p_value < 0.05] # Tavbe of significant gene mutations  
##    Hugo_Symbol Group1 Group2 n_mutated_group1 n_mutated_group2      p_value
## 1:        IDH1     M1   Rest         11 of 44         7 of 149 0.0002597371
## 2:        TP53     M7   Rest           3 of 3        12 of 190 0.0003857187
## 3:      DNMT3A     M5   Rest         10 of 19        38 of 174 0.0057610493
## 4:       CEBPA     M2   Rest          7 of 44         6 of 149 0.0117352110
## 5:       RUNX1     M0   Rest          5 of 19        11 of 174 0.0117436825
## 6:        NPM1     M5   Rest          7 of 19        26 of 174 0.0248582372
## 7:       CEBPA     M1   Rest          6 of 44         7 of 149 0.0478737468
##    OR_low   OR_high       fdr
## 1:      0 0.3926994 0.0308575
## 2:      0 0.1315271 0.0308575
## 3:      0 0.6406007 0.3072560
## 4:      0 0.6874270 0.3757978
## 5:      0 0.6466787 0.3757978
## 6:      0 0.8342897 0.6628863
## 7:      0 0.9869971 1.0000000
plotEnrichmentResults(enrich_res = fab.ce, pVal = 0.05) #plots enrichment results 

dgi = drugInteractions(maf = laml_maf, fontSize = 0.75) # looks for gene-drug interaction from the drug gene interaction database 

dnmt3a.dgi = drugInteractions(genes = "DNMT3A", drugs = TRUE) # known or reported drugs that interact with DNMT3A gene
## Number of claimed drugs for given genes:
##      Gene N
## 1: DNMT3A 7
dnmt3a.dgi[,.(Gene, interaction_types, drug_name, drug_claim_name)] # print these columns 
##      Gene interaction_types    drug_name drug_claim_name
## 1: DNMT3A                                            N/A
## 2: DNMT3A                   DAUNORUBICIN    Daunorubicin
## 3: DNMT3A                     DECITABINE      Decitabine
## 4: DNMT3A                     IDARUBICIN      IDARUBICIN
## 5: DNMT3A                     DECITABINE      DECITABINE
## 6: DNMT3A         inhibitor   DECITABINE   CHEMBL1201129
## 7: DNMT3A         inhibitor  AZACITIDINE      CHEMBL1489
OncogenicPathways(maf = laml_maf) #looks for enrichment of Oncogenic Signalling Pathways in TCGA
## Pathway alteration fractions
##        Pathway  N n_affected_genes fraction_affected
##  1:    RTK-RAS 85               18        0.21176471
##  2:      Hippo 38                7        0.18421053
##  3:      NOTCH 71                6        0.08450704
##  4:        MYC 13                3        0.23076923
##  5:        WNT 68                3        0.04411765
##  6:       TP53  6                2        0.33333333
##  7:       NRF2  3                1        0.33333333
##  8:       PI3K 29                1        0.03448276
##  9: Cell_Cycle 15                0        0.00000000
## 10:   TGF-Beta  7                0        0.00000000

PlotOncogenicPathways(maf = laml_maf, pathways = "RTK-RAS") #look at entire pathway 

library(BSgenome.Hsapiens.UCSC.hg19, quietly = TRUE) #get bases arouned mutated base and make matrix 
## 
## Attaching package: 'BiocGenerics'
## The following objects are masked from 'package:parallel':
## 
##     clusterApply, clusterApplyLB, clusterCall, clusterEvalQ,
##     clusterExport, clusterMap, parApply, parCapply, parLapply,
##     parLapplyLB, parRapply, parSapply, parSapplyLB
## The following objects are masked from 'package:stats':
## 
##     IQR, mad, sd, var, xtabs
## The following objects are masked from 'package:base':
## 
##     anyDuplicated, append, as.data.frame, basename, cbind, colnames,
##     dirname, do.call, duplicated, eval, evalq, Filter, Find, get, grep,
##     grepl, intersect, is.unsorted, lapply, Map, mapply, match, mget,
##     order, paste, pmax, pmax.int, pmin, pmin.int, Position, rank,
##     rbind, Reduce, rownames, sapply, setdiff, sort, table, tapply,
##     union, unique, unsplit, which, which.max, which.min
## 
## Attaching package: 'S4Vectors'
## The following object is masked from 'package:base':
## 
##     expand.grid
## 
## Attaching package: 'Biostrings'
## The following object is masked from 'package:base':
## 
##     strsplit
laml.tnm = trinucleotideMatrix(maf = laml_maf, prefix = 'chr', add = TRUE, ref_genome = "BSgenome.Hsapiens.UCSC.hg19") #Estimates APOBEC enrichment scores and makes matrix for signature analysis 
## Warning in trinucleotideMatrix(maf = laml_maf, prefix = "chr", add = TRUE, : Chromosome names in MAF must match chromosome names in reference genome.
## Ignorinig 101 single nucleotide variants from missing chromosomes chr23
## -Extracting 5' and 3' adjacent bases
## -Extracting +/- 20bp around mutated bases for background C>T estimation
## -Estimating APOBEC enrichment scores
## --Performing one-way Fisher's test for APOBEC enrichment
## ---APOBEC related mutations are enriched in  3.315 % of samples (APOBEC enrichment score > 2 ;  6  of  181  samples)
## -Creating mutation matrix
## --matrix of dimension 188x96
plotApobecDiff(tnm = laml.tnm, maf = laml_maf, pVal = 0.2) #Plots samples into APOBEC and non APOBEC and compares different genes 

## $results
##      Hugo_Symbol Enriched nonEnriched       pval        or      ci.up
##   1:        TP53        2          13 0.08175632 5.9976455  46.608861
##   2:        TET2        1          16 0.45739351 1.9407002  18.983979
##   3:        FLT3        2          45 0.65523131 1.4081851  10.211621
##   4:      DNMT3A        1          47 1.00000000 0.5335362   4.949499
##   5:      ADAM11        0           2 1.00000000 0.0000000 164.191472
##  ---                                                                 
## 132:         WAC        0           2 1.00000000 0.0000000 164.191472
## 133:         WT1        0          12 1.00000000 0.0000000  12.690862
## 134:      ZBTB33        0           2 1.00000000 0.0000000 164.191472
## 135:      ZC3H18        0           2 1.00000000 0.0000000 164.191472
## 136:      ZNF687        0           2 1.00000000 0.0000000 164.191472
##          ci.low adjPval
##   1: 0.49875432       1
##   2: 0.03882963       1
##   3: 0.12341748       1
##   4: 0.01101929       1
##   5: 0.00000000       1
##  ---                   
## 132: 0.00000000       1
## 133: 0.00000000       1
## 134: 0.00000000       1
## 135: 0.00000000       1
## 136: 0.00000000       1
## 
## $SampleSummary
##         Cohort SampleSize  Mean Median
## 1:    Enriched          6 7.167    6.5
## 2: nonEnriched        172 9.715    9.0
library('NMF') #load NMF library
## Loading required package: pkgmaker
## Loading required package: registry
## 
## Attaching package: 'pkgmaker'
## The following object is masked from 'package:S4Vectors':
## 
##     new2
## The following object is masked from 'package:base':
## 
##     isFALSE
## Loading required package: rngtools
## Loading required package: cluster
## NMF - BioConductor layer [OK] | Shared memory capabilities [NO: bigmemory] | Cores 3/4
##   To enable shared memory capabilities, try: install.extras('
## NMF
## ')
## 
## Attaching package: 'NMF'
## The following object is masked from 'package:S4Vectors':
## 
##     nrun
laml.sign = estimateSignatures(mat = laml.tnm, pConstant = 0.1, nTry = 4) #Estimate signatures
## -Running NMF for 4 ranks
## -Finished in 43.0s elapsed (7.355s cpu)

plotCophenetic(res = laml.sign) #Elbow bolot to see optimal number og signatures

laml.sig = extractSignatures(mat = laml.tnm, n = 3, pConstant = 0.1)
## -Running NMF for factorization rank: 3
## -Finished in2.127s elapsed (1.920s cpu)
laml.og30.cosm = compareSignatures(nmfRes = laml.sig, sig_db = "legacy") #compare with original 30 signatures
## -Comparing against COSMIC signatures
## ------------------------------------
## --Found Signature_1 most similar to COSMIC_1
##    Aetiology: spontaneous deamination of 5-methylcytosine [cosine-similarity: 0.84]
## --Found Signature_2 most similar to COSMIC_1
##    Aetiology: spontaneous deamination of 5-methylcytosine [cosine-similarity: 0.577]
## --Found Signature_3 most similar to COSMIC_5
##    Aetiology: Unknown [cosine-similarity: 0.851]
## ------------------------------------
laml.v3.cosm = compareSignatures(nmfRes = laml.sig, sig_db = "SBS") #compare with 60 signatures
## -Comparing against COSMIC signatures
## ------------------------------------
## --Found Signature_1 most similar to SBS1
##    Aetiology: spontaneous or enzymatic deamination of 5-methylcytosine [cosine-similarity: 0.858]
## --Found Signature_2 most similar to SBS6
##    Aetiology: defective DNA mismatch repair [cosine-similarity: 0.538]
## --Found Signature_3 most similar to SBS3
##    Aetiology: Defects in DNA-DSB repair by HR [cosine-similarity: 0.836]
## ------------------------------------
library('pheatmap') #load pheatmap
pheatmap::pheatmap(mat = laml.og30.cosm$cosine_similarities, cluster_rows = FALSE, main = "cosine similarity against validated signatures") #compares simillarities of signatures with confirmed signatures 

maftools::plotSignatures(nmfRes = laml.sig, title_size = 0.8) #Plot signatures

laml.se = signatureEnrichment(maf = laml_maf, sig_res = laml.sig) # Assign to sample and perform enrichment analysis that looks at muatations that are enriched at every signature 
## Running k-means for signature assignment..
## Performing pairwise and groupwise comparisions..
## Sample size per factor in Signature:
## 
## Signature_1 Signature_2 Signature_3 
##          60          65          63
## Estimating mutation load and signature exposures..

plotEnrichmentResults(enrich_res = laml.se, pVal = 0.05) #Plot enrichment results

Reference

Mayakonda, Anand, De-Chen Lin, Yassen Assenov, Christoph Plass, and H. Phillip Koeffler. 2018. “Maftools: Efficient and Comprehensive Analysis of Somatic Variants in Cancer.” Genome Research 28 (11). Cold Spring Harbor Laboratory Press: 1747–56. https://doi.org/10.1101/gr.239244.118.

Tamborero, David, Abel Gonzalez-Perez, and Nuria Lopez-Bigas. 2013. “OncodriveCLUST: Exploiting the Positional Clustering of Somatic Mutations to Identify Cancer Genes.” Bioinformatics 29 (18): 2238–44. https://doi.org/10.1093/bioinformatics/btt395.